import sys, os, time
import platform as plf
# Total physical RAM: page size * page count (POSIX-only sysconf keys;
# NOTE(review): this raises on Windows -- confirm target platform).
mem_bytes = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES')
mem_gib = mem_bytes/(1024.**3)
print('System Configuration: {}'.format(plf.platform())) #os.uname()
# FIX: the value is computed with 1024**3, i.e. GiB, so label it GiB (was "GB").
print('System RAM: {} GiB'.format(round(mem_gib,2)))
print('Python Version: {}'.format(sys.version))
print('Current Working Directory: {}'.format(os.getcwd()))
## Getting and setting the current directory as working directory
# os.chdir(os.getcwd())
Understanding the problem & dataset
Preprocessing the data
Feature Engineering
Selecting Modeling Algorithm
Parameter Tuning through CrossValidation
Building the Model
Checking the Results
Additional steps can include
Building varied kinds of models and ensembling them
Performing solid local validation and hyperparameter tuning
Raw Data with complex relation structure --> Preprocessing --> Feature Engineering --> Model Selection --> Parameter Tuning --> Model Evaluation
## Loading Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# !pip install xgboost
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import matthews_corrcoef,accuracy_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from lib import datasetPrimAnalysis, ScalingDF, plotFeatureAndProperty, visualizeFeature
from lib import plotConfusionMatrix, plotAccAndErrorWrtThreshold, DimenRed_Visual
from lib import DetNoOfClusters, DimensionTransf, PlotExplainedVar, ClusterDevelopment
from lib import VisualizeClusters, ComputingClusterEvalMetric
import warnings
# Silence DeprecationWarnings raised from any module whose name matches 'sklearn*'.
warnings.filterwarnings(module='sklearn*', action='ignore', category=DeprecationWarning)
# Silence RuntimeWarnings raised from modules matching 'classification*'.
warnings.filterwarnings(module='classification*', action='ignore', category=RuntimeWarning)
# Global random seed passed to every model constructor below for reproducibility.
seed = 12345
## Loading Data
def _load_and_preview(path, lead=''):
    """Read a tab-separated file, print its shape and show the first rows."""
    frame = pd.read_csv(path, delimiter='\t')
    print(lead + 'DataFrame shape is {}'.format(frame.shape))
    display(frame.head())
    return frame

# Train Data
df = _load_and_preview('ClassificationProblem1.txt')
# Submission Data
sub_df = _load_and_preview('Classification1Test.txt', lead='\n')
The values in the "Index" column differ between the two dataframes, though this is of no concern as the Index column won't play any role as a predictor here.
## Generating Feature Understanding
## datetime based feature
# 'F15' & 'F16' hold dates in m/d/Y form
feat_dt = ['F15', 'F16']
for col in feat_dt:
    df[col] = pd.to_datetime(df[col], format='%m/%d/%Y')
## Categorical Features
# unique-value counts observed during inspection:
#   Index: 101180 (== len(df));  F5-F9: ~19.8k each;  F17/F18: 5 each;
#   F19/F20: 646 each;  F21/F22: 21 each;  C (target): 2
feat_str = ['Index',
            'F5', 'F6', 'F7', 'F8', 'F9',
            'F17', 'F18',
            'F19', 'F20',
            'F21', 'F22',
            'C']
# Converting feature datatype to Object
for col in feat_str:
    df[col] = df[col].astype(str)
df_info = datasetPrimAnalysis(df)
## Feature Engineering & Manupulation
'''
Domain knowledge is not provided with this problem statement and we are not sure of feature defination.
Hence Feature manuplation and engineering is a challenge.
Making use of feature which are time series based is of particular chalenges, as there is not information
that can be considered for the forecasting.... <todo> no trend to be generated, it's just timestamps ...
'''
## Converting Information Type
# Handling Time Based Feature: express each timestamp as days since 1 Jan 1970
SECONDS_PER_DAY = 60 * 60 * 24
df['F15_d'] = df['F15'].map(lambda ts: ts.timestamp() / SECONDS_PER_DAY)
df['F16_d'] = df['F16'].map(lambda ts: ts.timestamp() / SECONDS_PER_DAY)
## Converting DataTypes
# To Numeric
feat_num = ['F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9', 'F10',
            'F11', 'F12', 'F13', 'F14', 'F15_d', 'F16_d', 'F19', 'F20',
            'F21', 'F22']
for col in feat_num:
    df[col] = df[col].astype(float)
# To Object
for col in ['Index', 'F17', 'F18', 'C']:
    df[col] = df[col].astype(str)
## Dropping some certain features
df.drop(columns=['F15', 'F16'], inplace=True)  # 'Index' is valuable for sub_df hence won't be dropped at this stage
df_info = datasetPrimAnalysis(df, msg=False)
df.drop(columns=['Index'], inplace=True)  # unnecessary in this (train) dataframe
## Handling Missing Obsservation
# Per-column null counts, keeping only columns that actually have missing values.
missing_data = df.isnull().sum(axis=0).reset_index()
missing_data.columns = ['column_name', 'missing_count']
missing_data = (missing_data[missing_data['missing_count'] > 0]
                .sort_values(by='missing_count'))
display(missing_data)
# Plot %Missing for every feature (categorical + numerical together).
tempDF = pd.concat([df_info['Categorical'], df_info['Numerical']], sort=False)
featName = list(tempDF.index)
plotFeatureAndProperty(featName, list(tempDF['%Missing']), featName, tit='%Missing')
## Missing Data interpolation
# Available methods in interpolate #https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.interpolate.html#pandas.DataFrame.interpolate
# method : {‘linear’, ‘time’, ‘index’, ‘values’, ‘nearest’, ‘zero’, ‘slinear’, ‘quadratic’, ‘cubic’, ‘barycentric’, ‘krogh’, ‘polynomial’, ‘spline’, ‘piecewise_polynomial’, ‘from_derivatives’, ‘pchip’, ‘akima’}
# sum(azdiasPart2['ALTERSKATEGORIE_GROB'].interpolate(method='linear').isna())
No column has any missing values, so nothing needs to be done here.
def generatePlots(df_info):
    """Plot mean, median and standard deviation for every numerical feature.

    df_info: dict produced by datasetPrimAnalysis; its 'Numerical' entry is a
    stats dataframe indexed by feature name.
    """
    num_stats = df_info['Numerical']
    names = list(num_stats.index)
    # Same three plots the notebook produced, driven by a (column, title) table.
    for stat_col, title in (('mean', 'Mean'),
                            ('50%', 'Median'),
                            ('std', 'Standard Deviation')):
        plotFeatureAndProperty(names, list(num_stats[stat_col]), names, tit=title)
## Standard Scaling
df_info = datasetPrimAnalysis(df, msg=False)
print('\n\t_________ _________| Before Scaling |________ _________')
generatePlots(df_info)
print('Scaling Feature')
# Fit the standardizer on the numerical columns and overwrite them in place;
# iniDescStats_dict keeps the pre-scaling stats for reuse on the submission data.
num_cols = df_info['Numerical'].index
scaler = ScalingDF(df.loc[:, num_cols])
df.loc[:, num_cols], iniDescStats_dict = scaler.standardization()
# again checking the structure
df_info = datasetPrimAnalysis(df, msg=False)
print('\n\t________ _________| After Scaling |________ _________')
generatePlots(df_info)
colToTransform = ['F17', 'F18']
#<todo> write a custom processing class
## OneHot Encoding using a different method
# pd.get_dummies creates F17_*/F18_* indicator columns; the originals are dropped.
dummies = pd.get_dummies(df.loc[:, colToTransform], prefix=None, prefix_sep='_')
df = df.join(dummies)
df.drop(columns=colToTransform, inplace=True)
## Reordering the columns
''' will do later -- not needed'''
## Univariate Outliers Detection
def outlier_zScore_series(ser, threshold=3):
    """Return positional indices of values whose absolute z-score exceeds threshold.

    ser: pandas Series (or any iterable with .mean()/.std()).
    threshold: z-score cutoff (default 3).
    Raises ValueError when the series is constant (std == 0), since z-scores
    are undefined there.
    """
    ser_mean, ser_std = ser.mean(), ser.std()
    # Guard clause first: a constant series would divide by zero below.
    # FIX: raise the specific ValueError instead of a bare Exception.
    if ser_std == 0:
        raise ValueError('Dividing by zero')
    # FIX: iterate with enumerate instead of copying to a list and indexing.
    return [i for i, val in enumerate(ser)
            if abs((val - ser_mean) / ser_std) > threshold]
## <todo> IQR based
outlier_index_list, feature_having_out = [], []
df_info = datasetPrimAnalysis(df, msg=False)
print('Performing univariate outlier detection')
# For every numerical feature collect the flagged row indices and, when any
# are found, record the feature and visualize it against the target class.
for feat in df_info['Numerical'].index:
    detected_obs_ind = outlier_zScore_series(df[feat])
    outlier_index_list.extend(detected_obs_ind)
    print('|\tFeature "{}" contains {} outlier cases.'.format(feat, len(detected_obs_ind)))
    if len(detected_obs_ind) > 0:
        feature_having_out.append((feat, len(detected_obs_ind)))
        visualizeFeature(df[feat], df['C'])
# print(outlier_index_list)
There are 4 features (F21, F22, F15_d, & F16_d) which get highlighted in univariate outlier detection. This univariate outlier detection is performed by using the z-score as a means to detect outliers. If the absolute value of the z-score for a value is greater than 3 then that observation is considered an outlier.
From the plots and stats highlighted above it can be understood that, because of the nature of the data, we are counting many data points as outliers. Additionally, based on their distributions these points are not to be considered outliers. Hence we won't be dropping these features.
## Outlier Removal
# Count, per row index, how many features flagged it as an outlier.
ind_dict = {}  # index -> number of features that flagged it
for idx in outlier_index_list:
    ind_dict[idx] = ind_dict.get(idx, 0) + 1
if len(ind_dict) < 10:
    print('Indexes that have been marked as outlier and by how many features', ind_dict)
else:
    # Bucket indices by how many features flagged them.
    indME3 = [(k, c) for k, c in ind_dict.items() if c >= 3]
    indME2 = [(k, c) for k, c in ind_dict.items() if c == 2]
    indME1 = [(k, c) for k, c in ind_dict.items() if c == 1]  # computed for reference only
    if indME2 or indME3:
        print('Indexes that have been marked as outlier:')
        if indME3:
            print('|\tBy more than or equal to 3 feature :', indME3)
        if indME2:
            print('|\tBecause of any 2 feature :', indME2)
# Dropping the flagged rows was considered and intentionally left disabled
# (see the analysis above: these points are not true outliers).
## Multivariate Outlier Detection - Anomaly Detection
# <todo>
## generating correlation matrix
def generate_correlation_plot(DF, method='pearson'):
    '''
    Draw and show a heatmap of the pairwise feature correlation matrix.

    DF     : DataFrame whose numeric columns are correlated.
    method : {'pearson', 'kendall', 'spearman'} -- forwarded to DataFrame.corr.
    '''
    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=(20, 19))
    # FIX: title was hardcoded to 'Pearson' even when another method was passed.
    plt.title('{} Correlation of Features'.format(method.capitalize()))
    # Draw the heatmap using seaborn
    sns.heatmap(DF.corr(method=method).round(2), linewidths=0.25, vmax=1.0, square=True,
                cmap="YlGnBu", linecolor='black', annot=True)
    plt.show()
    # f.savefig(config['input']['ClustFileSavingLoc_dir'] + 'CorrelationPlot__{}.png'.format(time.time()), bbox_inches="tight")
generate_correlation_plot(df)
Highly uncorrelated features
# Inspect F20 against the one-hot indicator F18_1 (created by the get_dummies step).
visualizeFeature(df['F20'], df['F18_1'].astype(float))
## Critical Class
# Visualize the class balance of the target column 'C'.
sns.countplot(x= df['C'])
plt.show()
'''Not Required'''#<todo>
# Splitting the dataset into the Training set and Test set
## Dividing the data into X and Y
x = df.loc[:, df.columns != 'C'].copy()
# Target as integer labels (was stored as str during preprocessing).
y = df['C'].astype(int)
## Splitting the dataset (80/20, fixed seed for reproducibility)
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=0)
print('length of train and test set :', len(xtrain), len(xtest))
def dataCleaningFunction(df, predictInfo=None):
    """Apply the full cleaning pipeline to a raw train or submission dataframe.

    Steps: datetime parsing, dtype casts, timestamp-to-days features, column
    drops, standard scaling, and one-hot encoding of F17/F18.

    Parameters
    ----------
    df : pd.DataFrame
        Raw dataframe (train layout or submission layout without 'C').
    predictInfo : dict or None
        None  -> 'Train' cycle: fit the scaler and return its stats.
        dict with key 'scalerDict' -> 'Predict' cycle: reuse the train-time
        scaler statistics so the new data is scaled identically.

    Returns
    -------
    (pd.DataFrame, dict)
        The cleaned dataframe and the scaler-info dict.
    """
    df = df.copy()
    # FIX: compare against None with 'is', not '=='.
    cycle = 'Train' if predictInfo is None else 'Predict'
    print('\n', cycle)
    ## datetime based feature
    feat_dt = ['F15', 'F16']
    for f in feat_dt:
        df[f] = pd.to_datetime(df[f], format='%m/%d/%Y')
    ## Categorical Features
    feat_str = ['Index']
    feat_str += ['F5', 'F6', 'F7', 'F8', 'F9']
    feat_str += ['F17', 'F18']
    feat_str += ['F19', 'F20']
    feat_str += ['F21', 'F22']
    if cycle == 'Train':
        feat_str += ['C']  # target column only exists on the train side
    # Converting feature datatype to Object
    for f in feat_str:
        df[f] = df[f].astype(str)
    ## Converting Information Type: days since the Unix epoch
    df['F15_d'] = [ele.timestamp() / (60*60*24) for ele in df['F15']]
    df['F16_d'] = [ele.timestamp() / (60*60*24) for ele in df['F16']]
    # To Numeric
    feat_num = ['F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9', 'F10',
                'F11', 'F12', 'F13', 'F14', 'F15_d', 'F16_d', 'F19', 'F20',
                'F21', 'F22']
    for f in feat_num:
        df[f] = df[f].astype(float)
    # To Object
    feat_str = ['Index', 'F17', 'F18'] + (['C'] if cycle == 'Train' else [])
    for f in feat_str:
        df[f] = df[f].astype(str)
    ## Dropping raw datetime columns; 'Index' is kept on the submission frame
    ## because it is needed for the result file.
    df.drop(columns=['F15', 'F16'], inplace=True)
    df_info = datasetPrimAnalysis(df, msg=False)
    if cycle == 'Train':
        df.drop(columns=['Index'], inplace=True)
    ## Handling Missing Observation
    '''Nothing to Do'''
    ## Feature Scaling
    if cycle == 'Train':
        predictInfo = {}
        scaler = ScalingDF(df.loc[:, df_info['Numerical'].index])
        df.loc[:, df_info['Numerical'].index], predictInfo['scalerDict'] = scaler.standardization()
    else:
        # Reuse statistics captured during training so scaling is identical.
        scaler = ScalingDF(df.loc[:, df_info['Numerical'].index], predictInfo['scalerDict'])
        df.loc[:, df_info['Numerical'].index], iniDescStats_dict = scaler.standardization()
    df_info = datasetPrimAnalysis(df, msg=False)
    ## Dummy Feature Creation
    colToTransform = ['F17', 'F18']
    df = df.join(pd.get_dummies(df.loc[:, colToTransform], prefix=None, prefix_sep='_'))
    df.drop(columns=colToTransform, inplace=True)
    ## Outlier handling / class balancing: nothing to do for this dataset.
    return df, predictInfo
# cleanDF, predictInfo = dataCleaningFunction(df)
# Feed the scaler statistics fitted on the training data into the Predict cycle
# so the submission frame is scaled with the same mean/std.
dictToPassForPred = {'scalerDict': iniDescStats_dict}
xSubDF, predictInfo = dataCleaningFunction(sub_df, predictInfo=dictToPassForPred)
xSubDF.head()
def evaluate(model, test_features, test_labels):
    """Print and return (accuracy %, Matthews correlation coefficient).

    model         : fitted estimator exposing .predict().
    test_features : feature matrix passed straight to model.predict.
    test_labels   : true integer class labels.
    """
    predictions = model.predict(test_features)
    # For 0/1 labels this is a per-sample misclassification indicator.
    errors = abs(predictions - test_labels)
    accuracy = accuracy_score(test_labels, predictions) * 100
    mcc = matthews_corrcoef(test_labels, predictions)
    print('\nModel Performance')
    # FIX: dropped the 'degrees' wording -- a copy-paste leftover from a
    # regression tutorial; this is a classification error rate.
    print('Average Error: {:0.4f}.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    print('Mattheaw Correlation Coefficient = {:0.3f}.'.format(mcc))
    return accuracy, mcc
# Baseline RandomForest with default hyperparameters (gini split criterion).
base_model = RandomForestClassifier(criterion='gini', n_jobs=-1, random_state=seed)
base_model.fit(xtrain, ytrain)
print('Parameters Value :\n|\t', base_model.get_params())
print('_'*25+'On Test Data'+'_'*25)
# Keep the baseline scores for the improvement comparison after grid search.
base_accuracy, base_mcc = evaluate(base_model, xtest, ytest)
plotConfusionMatrix(y_act=ytest, y_pred=base_model.predict(xtest))
# Create the parameter grid based on the results of random search.
# Single-value entries pin the RandomForest defaults; only n_estimators is swept.
param_grid = {
    'criterion': ['gini'],             #, 'entropy'
    'bootstrap': [True],
    'max_depth': [None],               #120(highlighted), 130
    'max_features': ['auto'],          #4(highlighted)
    'min_samples_leaf': [1],           #6(highlighted), 8
    'min_samples_split': [2],          #17(highlighted), 18
    'n_estimators': [2, 4, 6, 8, 10],  #100(highlighted)
}
# Create a base model for the search.
rfc = RandomForestClassifier(n_jobs=-1, random_state=seed)
# Instantiate the grid search model
# ref:https://scikit-learn.org/stable/modules/model_evaluation.html #,scoring='f1'
grid_search_rf = GridSearchCV(estimator=rfc,
                              param_grid=param_grid,
                              scoring='roc_auc',
                              cv=3,
                              n_jobs=-1,
                              verbose=1,
                              return_train_score=True)
# Fit the grid search to the data
grid_search_rf.fit(xtrain, ytrain)
## grid_search.grid_scores_ ----> grid_search.cv_results_
print('Train Score')
print(grid_search_rf.cv_results_['mean_train_score'])
print(grid_search_rf.cv_results_['std_train_score'])
print('Test Score')
print(grid_search_rf.cv_results_['mean_test_score'])
print(grid_search_rf.cv_results_['std_test_score'])
# Best params
# FIX: grid_scores_ was deprecated in sklearn 0.18 and removed in 0.20;
# best_params_/best_score_ provide the same information.
best_parameters_rf = grid_search_rf.best_params_
score_rf = grid_search_rf.best_score_
print('Raw AUC score:', score_rf)
for param_name in sorted(best_parameters_rf.keys()):
    print("%s: %r" % (param_name, best_parameters_rf[param_name]))
print('\nBest Parameter Combination: \n|\t', grid_search_rf.best_params_)
## Evaluating
print('_'*25+'On Test Data'+'_'*25)
best_grid = grid_search_rf.best_estimator_
grid_accuracy, grid_mcc = evaluate(best_grid, xtest, ytest)
plotConfusionMatrix(y_act=ytest, y_pred=best_grid.predict(xtest))
print('\nImprovement in Accuracy is {:0.2f}%.'.format( 100 * (grid_accuracy - base_accuracy) / base_accuracy))
print('Improvement in Mattheaw Correlation Coefficient is {:0.2f}%.'.format( 100 * (grid_mcc - base_mcc) / base_mcc))
Even though accuracy and MCC have decreased, we have still been able to increase the number of true positives.
# ## DataTransformation
# dtrain = xgb.DMatrix(data = xtrain, label = ytrain)
# dtest = xgb.DMatrix(data = xtest, label = ytest)
# param = {
# 'max_depth': 3, # the maximum depth of each tree
# 'eta': 0.3, # the training step for each iteration
# 'silent': 1, # logging mode - quiet
# 'objective': 'multi:softprob', # error evaluation for multiclass training
# 'num_class': 3} # the number of classes that exist in this datset
# num_round = 20 # the number of training iterations
# base_model = xgb.train(param, dtrain, num_boost_round=num_round)
# ## Using Optimized Parameters
# our_params = { 'objective': 'binary:logistic'}
# # Grid Search CV optimized settings
# cv_xgb = xgb.cv(params = our_params, dtrain = dtrain, num_boost_round = 3000, nfold = 5,
# metrics = ['error'], # Make sure you enter metrics inside a list or you may encounter issues!
# early_stopping_rounds = 100, verbose_eval=True)
# cv_xgb.loc[cv_xgb["test-error-mean"] == min(cv_xgb["test-error-mean"])]
# Baseline XGBoost classifier with default hyperparameters.
base_model = xgb.XGBClassifier(objective='binary:logistic', n_jobs=-1, random_state=seed)
base_model.fit(xtrain, ytrain)
print('Parameters Value :\n|\t', base_model.get_params())
print('_'*25+'On Test Data'+'_'*25)
# Keep the baseline scores for the post-grid-search comparison.
base_accuracy, base_mcc = evaluate(base_model, xtest, ytest)
plotConfusionMatrix(y_act=ytest, y_pred=base_model.predict(xtest))
# Fixed parameters for the estimator; everything else stays at its default.
ind_params = {'objective': 'binary:logistic'}
# Parameters swept by the grid search.
cv_params = {
    'learning_rate': [0.1, 0.2],
    'max_depth': [4, 5, 6],
    'subsample': [1],
    'colsample_bytree': [1],
}
'''
Default
max_depth=3, learning_rate=0.1, n_estimators=100, silent=True, objective='binary:logistic',
booster='gbtree', n_jobs=1, nthread=None, gamma=0, min_child_weight=1, max_delta_step=0,
subsample=1, colsample_bytree=1, colsample_bylevel=1, reg_alpha=0, reg_lambda=1,
scale_pos_weight=1, base_score=0.5, random_state=0, seed=None, missing=None, **kwargs
'''
# Create a base model for the search.
xgbC = xgb.XGBClassifier(**ind_params)
# Instantiate the grid search model
#SKF = StratifiedKFold(n_splits = 2, shuffle=True)
#SKFold = SKF.split(X_train, y_train)
grid_search_xgb = GridSearchCV(estimator=xgbC,
                               param_grid=cv_params,
                               scoring='roc_auc',
                               n_jobs=-1,
                               verbose=1)
#cv = SKFold, scoring='roc_auc', refit=True
# Fit the grid search to the data
grid_search_xgb.fit(xtrain, ytrain)
# Best params
# FIX: grid_scores_ was deprecated in sklearn 0.18 and removed in 0.20;
# best_params_/best_score_ provide the same information.
best_parameters_xgb = grid_search_xgb.best_params_
score_xgb = grid_search_xgb.best_score_
print('Raw AUC score:', score_xgb)
for param_name in sorted(best_parameters_xgb.keys()):
    print("%s: %r" % (param_name, best_parameters_xgb[param_name]))
print('\nBest Parameter Combination: \n|\t', grid_search_xgb.best_params_)
## Evaluating
print('_'*25+'On Test Data'+'_'*25)
best_grid_xgb = grid_search_xgb.best_estimator_
grid_accuracy, grid_mcc = evaluate(best_grid_xgb, xtest, ytest)
plotConfusionMatrix(y_act=ytest, y_pred=best_grid_xgb.predict(xtest))
print('\nImprovement in Accuracy is {:0.2f}%.'.format( 100 * (grid_accuracy - base_accuracy) / base_accuracy))
print('Improvement in Mattheaw Correlation Coefficient is {:0.2f}%.'.format( 100 * (grid_mcc - base_mcc) / base_mcc))
# Fitting XGBoost to the Training set
#xgb.cv(params = xgb_params, dtrain = dtrain, num_boost_round = NumRound, nfold=3)
Using XGB we can see that there's an improvement in both accuracy and the Matthews correlation coefficient.
This is also an important part, as it helps in understanding the role and importance of features.
rfc = RandomForestClassifier(criterion = 'gini',n_jobs = -1, random_state = seed) #entropy
featTrain = xtrain.columns.values
rfc.fit(xtrain, ytrain)
## plot the importances ##
importances = rfc.feature_importances_
# Spread of each feature's importance across the individual trees.
std = np.std([tree.feature_importances_ for tree in rfc.estimators_], axis=0)
## Showing top 50 features
# FIX: argsort is ascending, so [:50] selected the 50 LEAST important features;
# take the last 50 entries to get the top ones (still ascending for barh).
indices = np.argsort(importances)[-50:]
plt.figure(figsize=(15, int(len(indices)*0.30)))
# FIX: xerr must be aligned with the selected features -- std[indices], not std.
plt.barh(range(len(indices)), importances[indices], xerr=std[indices], color="r", align="center")
plt.yticks(range(len(indices)), featTrain[indices], rotation='horizontal')
plt.ylim([-1, len(indices)])
plt.title('Computed feature importances via RandomForest using {} metric'.format(rfc.criterion), fontsize=14)
# plt.gca().invert_yaxis()
plt.show()
## Plot confusion Matrix
## On Training Data
print('_'*25+'On Training Data'+'_'*25)
# Show the class distribution alongside the confusion matrix.
display(pd.DataFrame(ytrain.value_counts()).reset_index()
          .rename(columns={'index': 'class', 'C': 'count'}))
y_pred = rfc.predict(xtrain)
plotConfusionMatrix(y_act=ytrain, y_pred=y_pred)
## On Test Data
print('_'*25+'On Test Data'+'_'*25)
display(pd.DataFrame(ytest.value_counts()).reset_index()
          .rename(columns={'index': 'class', 'C': 'count'}))
y_pred = rfc.predict(xtest)
plotConfusionMatrix(y_act=ytest, y_pred=y_pred)
Random Forest is a deterministic algorithm here (fixed seed), so either the model is not good (it overfits) or the input data is not good.
rfc = RandomForestClassifier(criterion = 'entropy',n_jobs = -1, random_state = seed)
featTrain = xtrain.columns.values
rfc.fit(xtrain, ytrain)
## plot the importances
importances = rfc.feature_importances_
# Spread of each feature's importance across the individual trees.
std = np.std([tree.feature_importances_ for tree in rfc.estimators_], axis=0)
## Showing top 50 features
# FIX: argsort is ascending, so [:50] selected the 50 LEAST important features;
# take the last 50 entries to get the top ones (still ascending for barh).
indices = np.argsort(importances)[-50:]
plt.figure(figsize=(15, int(len(indices)*0.30)))
# FIX: xerr must be aligned with the selected features -- std[indices], not std.
plt.barh(range(len(indices)), importances[indices], xerr=std[indices], color="r", align="center")
plt.yticks(range(len(indices)), featTrain[indices], rotation='horizontal')
plt.ylim([-1, len(indices)])
plt.title('Computed feature importances via RandomForest using {} metric'.format(rfc.criterion), fontsize=14)
# plt.gca().invert_yaxis()
plt.show()
## On Training Data
print('_'*25+'On Training Data'+'_'*25)
# Show the class distribution alongside the confusion matrix.
display(pd.DataFrame(ytrain.value_counts()).reset_index()
          .rename(columns={'index': 'class', 'C': 'count'}))
y_pred = rfc.predict(xtrain)
plotConfusionMatrix(y_act=ytrain, y_pred=y_pred)
## On Test Data
print('_'*25+'On Test Data'+'_'*25)
display(pd.DataFrame(ytest.value_counts()).reset_index()
          .rename(columns={'index': 'class', 'C': 'count'}))
y_pred = rfc.predict(xtest)
plotConfusionMatrix(y_act=ytest, y_pred=y_pred)
Random Forest is a deterministic algorithm here (fixed seed), so either the model is not good (it overfits) or the input data is not good.
# Train an XGB model with the tuned parameters; everything else stays default.
params = best_parameters_xgb
xgbC = xgb.XGBClassifier(**params, random_state=seed, n_jobs=-1)
xgbC.fit(xtrain, ytrain)
## plot the importances
ax = xgb.plot_importance(xgbC)
fig = ax.figure
fig.set_size_inches(15, int(len(xtrain.columns.values) * 0.30))
print('Computed feature importances via XGBoost using {} metric'.format('F Score'))
# Tree visualization (needs graphviz on PATH):
# #!pip install graphviz
# import graphviz
# import os
# os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'
# xgb.plot_tree(FinalXGB_Mod, num_trees=2)
# fig = plt.gcf()
# fig.set_size_inches(350, 80)
# fig.savefig('tree.png')
# xgb.to_graphviz(FinalXGB_Mod, num_trees=2)
## Training the Model
params = best_parameters_xgb  ## rest can be default
xgbC = xgb.XGBClassifier(**params, random_state=seed, n_jobs=-1)
xgbC.fit(xtrain, ytrain)
## Testing the Model
# Stochastic Prediction: keep the class-1 probability for each test row.
ypred = [row[1] for row in xgbC.predict_proba(xtest)]
# Understanding How Accuracy Changes via changing threshold
threshold_best_mcc = plotAccAndErrorWrtThreshold(ytest, ypred)
print('Best Predictive power of the model is when the threshold is around {}'.format(threshold_best_mcc))
print('Though the accuracy will be low at this threshold ~ 55%')
# Making the Confusion Matrix at thresholds 0.05 .. 0.3 in steps of 0.05
# (float accumulation kept deliberately identical to the original loop).
i, j, step, ClassThres = 0.05, 0.3, 0.05, []
while i <= j:
    ClassThres.append(round(i, 3))
    i += step
print('_'*25+'On Test Data'+'_'*25)
for thresh in ClassThres:
    print('_'*15+'When thershold is {}'.format(thresh)+'_'*15)
    hard_labels = [1 if p > thresh else 0 for p in ypred]
    plotConfusionMatrix(y_act=ytest, y_pred=hard_labels)
Threshold of about 0.25 seems to be the best
## Generating Result for other data
threshold = 0.27
# All feature columns except the identifier.
feature_cols = [col for col in xSubDF.columns.values if col != 'Index']
xSubDF_Fil = xSubDF.loc[:, feature_cols]
# Class-1 probabilities for the submission rows.
ypred = [row[1] for row in xgbC.predict_proba(xSubDF_Fil)]
xSubDF.loc[:, 'Class'] = ypred
xSubDF.loc[:, ['Index', 'Class']].to_csv('result_xgb_probalistic_pred_Class.csv', sep='\t', index=False)
# Hard labels at the chosen threshold.
xSubDF.loc[:, 'Class'] = [1 if p > threshold else 0 for p in ypred]
xSubDF.loc[:, ['Index', 'Class']].to_csv('result_xgb_deterministic_pred_Class.csv', sep='\t', index=False)
Definition: Logistic regression is a machine learning algorithm for classification. In this algorithm, the probabilities describing the possible outcomes of a single trial are modelled using a logistic function.
Advantages: Logistic regression is designed for this purpose (classification), and is most useful for understanding the influence of several independent variables on a single outcome variable.
Disadvantages: Works only when the predicted variable is binary, assumes all predictors are independent of each other, and assumes data is free of missing values.
from sklearn.linear_model import LogisticRegression
# Plain logistic regression baseline with the shared seed.
lr = LogisticRegression(n_jobs=1, random_state=seed)
lr.fit(xtrain, ytrain)
## Testing the Model
ypred = lr.predict(xtest)
## On Test Data
print('{0}On Test Data{0}'.format('_'*25))
plotConfusionMatrix(y_act=ytest, y_pred=ypred)
Definition: Naive Bayes algorithm based on Bayes’ theorem with the assumption of independence between every pair of features. Naive Bayes classifiers work well in many real-world situations such as document classification and spam filtering.
Advantages: This algorithm requires a small amount of training data to estimate the necessary parameters. Naive Bayes classifiers are extremely fast compared to more sophisticated methods.
Disadvantages: Naive Bayes is known to be a bad estimator.
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes baseline (no hyperparameters to tune).
nb = GaussianNB()
nb.fit(xtrain, ytrain)
## Testing the Model
ypred = nb.predict(xtest)
## On Test Data
print('{0}On Test Data{0}'.format('_'*25))
plotConfusionMatrix(y_act=ytest, y_pred=ypred)
# Drop the XGB predictions written earlier before producing the NB submission.
xSubDF.drop(columns='Class', inplace=True)
## Generating Result for other data
feature_cols = [col for col in xSubDF.columns.values if col != 'Index']
xSubDF_Fil = xSubDF.loc[:, feature_cols]
xSubDF.loc[:, 'Class'] = nb.predict(xSubDF_Fil)
xSubDF.loc[:, ['Index', 'Class']].to_csv('result_naiveBayes_deterministic_pred_Class.csv', sep='\t', index=False)
Definition: Stochastic gradient descent is a simple and very efficient approach to fit linear models. It is particularly useful when the number of samples is very large. It supports different loss functions and penalties for classification.
Advantages: Efficiency and ease of implementation.
Disadvantages: Requires a number of hyper-parameters and it is sensitive to feature scaling.
from sklearn.linear_model import SGDClassifier
# NOTE(review): max_iter=None relies on an old sklearn default and may be
# rejected by newer versions -- confirm against the installed release.
sgd = SGDClassifier(loss='modified_huber', max_iter=None, tol=1e-3,
                    shuffle=True, random_state=seed)
sgd.fit(xtrain, ytrain)
## Testing the Model
ypred = sgd.predict(xtest)
## On Test Data
print('{0}On Test Data{0}'.format('_'*25))
plotConfusionMatrix(y_act=ytest, y_pred=ypred)
Definition: Neighbours based classification is a type of lazy learning as it does not attempt to construct a general internal model, but simply stores instances of the training data. Classification is computed from a simple majority vote of the k nearest neighbours of each point.
Advantages: This algorithm is simple to implement, robust to noisy training data, and effective if training data is large.
Disadvantages: Need to determine the value of K, and the computation cost is high as it needs to compute the distance of each instance to all the training samples.
from sklearn.neighbors import KNeighborsClassifier
# 5-nearest-neighbours baseline (default minkowski/p=2 metric).
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(xtrain, ytrain)
## Testing the Model
ypred = knn.predict(xtest)
## On Test Data
print('{0}On Test Data{0}'.format('_'*25))
plotConfusionMatrix(y_act=ytest, y_pred=ypred)
Definition: Given a data of attributes together with its classes, a decision tree produces a sequence of rules that can be used to classify the data.
Advantages: Decision Tree is simple to understand and visualise, requires little data preparation, and can handle both numerical and categorical data.
Disadvantages: Decision tree can create complex trees that do not generalise well, and decision trees can be unstable because small variations in the data might result in a completely different tree being generated.
from sklearn.tree import DecisionTreeClassifier
# Single decision tree baseline with the shared seed.
dtree = DecisionTreeClassifier(random_state=seed)
dtree.fit(xtrain, ytrain)
## Testing the Model
ypred = dtree.predict(xtest)
## On Test Data
print('{0}On Test Data{0}'.format('_'*25))
plotConfusionMatrix(y_act=ytest, y_pred=ypred)
Definition: Random forest classifier is a meta-estimator that fits a number of decision trees on various sub-samples of datasets and uses average to improve the predictive accuracy of the model and controls over-fitting. The sub-sample size is always the same as the original input sample size but the samples are drawn with replacement.
Advantages: Reduction in over-fitting and random forest classifier is more accurate than decision trees in most cases.
Disadvantages: Slow real time prediction, difficult to implement, and complex algorithm.
from sklearn.ensemble import RandomForestClassifier
# RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
rfc = RandomForestClassifier(n_jobs=-1, random_state=seed)
rfc.fit(xtrain, ytrain)
## Testing the Model
# FIX: was dtree.predict(xtest) -- it evaluated the DecisionTree fitted in the
# previous cell instead of the RandomForest trained here.
ypred = rfc.predict(xtest)
## On Test Data
print('_'*25+'On Test Data'+'_'*25)
plotConfusionMatrix(y_act=ytest, y_pred=ypred)
Pros AdaBoost is easy to implement. It iteratively corrects the mistakes of the weak classifier and improves accuracy by combining weak learners. You can use many base classifiers with AdaBoost. AdaBoost is not prone to overfitting. This can be found out via experiment results, but there is no concrete reason available.
Cons AdaBoost is sensitive to noise data. It is highly affected by outliers because it tries to fit each point perfectly. AdaBoost is slower compared to XGBoost.
from sklearn.ensemble import AdaBoostClassifier

## Boosted stumps with the real-valued SAMME.R variant (uses class
## probabilities from the base learner, typically converges faster).
adaBoost = AdaBoostClassifier(algorithm='SAMME.R', random_state=seed)
adaBoost.fit(xtrain, ytrain)
## Evaluate on the held-out test split.
ypred = adaBoost.predict(xtest)
print('On Test Data'.center(62, '_'))
plotConfusionMatrix(y_act=ytest, y_pred=ypred)
## Re-fit with the discrete SAMME variant (uses hard class labels from the
## base learner) for comparison against SAMME.R above.
adaBoost = AdaBoostClassifier(algorithm='SAMME', random_state=seed)
adaBoost.fit(xtrain, ytrain)
## Evaluate on the held-out test split.
ypred = adaBoost.predict(xtest)
print('On Test Data'.center(62, '_'))
plotConfusionMatrix(y_act=ytest, y_pred=ypred)
from sklearn.ensemble import GradientBoostingClassifier
## Fit a gradient-boosted trees classifier with default hyperparameters.
clf = GradientBoostingClassifier()
clf.fit(xtrain, ytrain)
## Testing the Model
## BUG FIX: predict with the fitted gradient-boosting model (clf), not the
## AdaBoost model from the previous cell — the original line never evaluated
## the gradient-boosting model at all.
ypred = clf.predict(xtest)
## On Test Data
print('_'*25+'On Test Data'+'_'*25)
plotConfusionMatrix(y_act=ytest, y_pred=ypred)
Definition: Support vector machine is a representation of the training data as points in space separated into categories by a clear gap that is as wide as possible. New examples are then mapped into that same space and predicted to belong to a category based on which side of the gap they fall.
Advantages: Effective in high dimensional spaces and uses a subset of training points in the decision function so it is also memory efficient.
Disadvantages: The algorithm does not directly provide probability estimates, these are calculated using an expensive five-fold cross-validation.
## Too Slow
# from sklearn.svm import SVC
# svm = SVC(kernel='linear', random_state=seed)
# svm.fit(xtrain, ytrain)
# ## Testing the Model
# ypred = svm.predict(xtest)
# ## On Test Data
# print('_'*25+'On Test Data'+'_'*25)
# plotConfusionMatrix(y_act=ytest, y_pred=ypred)
# # Fitting Kernel SVM to the Training set
# from sklearn.svm import SVC
# svm = SVC(kernel = 'rbf', random_state = 0)
# svm.fit(xtrain, ytrain)
## Dimensionality reduction on the training features via the project helper
## DimensionTransf (from lib). The PCA config is passed as a string-encoded
## dict — presumably eval'd/parsed inside the helper; verify in lib.
AlgoToUse = 'PCA' ##
## Getting the Configuration for the Algorithm
AlgoConfig = '''{'n_components': None, 'copy': True, 'whiten': False, 'svd_solver': 'auto', 'tol':0.0, 'iterated_power':'auto', 'random_state':None }'''
## Parameter reference:
## http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html#sklearn.decomposition.PCA
## (n_components=None, copy=True, whiten=False, svd_solver=’auto’, tol=0.0, iterated_power=’auto’, random_state=None)
## Fit PCA on xtrain; returns the transformed features and the fitted model.
xtrain_DimTransf, Model = DimensionTransf(AlgoToUse, AlgoConfig, xtrain)
## Scree/explained-variance plot of the fitted PCA model.
PlotExplainedVar(Model)
## Heuristic (e.g. elbow-style) cluster-count diagnostics on the PCA output.
DetNoOfClusters(xtrain_DimTransf, AlgoToSelect='PCA transformed Feature')
## 2-D visualization of the reduced features, colored by the true labels;
## reset_index aligns ytrain's index with the transformed array's positions.
DimenRed_Visual(xtrain_DimTransf, ytrain.reset_index(drop=True))
## Two Cluster
## Unsupervised baseline: KMeans with k=2 on the PCA-transformed training
## features, then compare cluster assignments against the true labels.
ClustAlgo = 'KMeans'
ClustAlgo_ParamConfig = '''{'n_clusters':2, 'init':'k-means++', 'n_init': 10, 'max_iter': 300, 'tol': 0.0001}'''
## Full KMeans signature for reference:
## (n_clusters=8, init=’k-means++’, n_init=10, max_iter=300, tol=0.0001, precompute_distances=’auto’, verbose=0, random_state=None, copy_x=True, n_jobs=1, algorithm=’auto’)
## algorithm : “auto”, “full” or “elkan”, default=”auto”
## Fit the clusterer; helper returns a frame with a '*_Predict' column added.
xtrain_ClustPred = ClusterDevelopment(ClustAlgo, ClustAlgo_ParamConfig, xtrain_DimTransf)
VisualizeClusters(xtrain_ClustPred, AlgoToUse, ClustAlgo)
## Evaluating Results
## Pull the predicted-cluster column (name ends with '_Predict').
ypred = xtrain_ClustPred[xtrain_ClustPred.filter(like='_Predict').columns.values[0]]
## Cluster ids are arbitrary; uncomment to swap labels if they come out inverted.
# level_map = {1:0, 0:1}
# ypred = ypred.map(level_map)
plotConfusionMatrix(ytrain, ypred)
## Cluster-quality metrics (project helper) against the true training labels.
ClusterEvalScores = ComputingClusterEvalMetric(xtrain_DimTransf, ytrain, ypred)
# ClusterEvalScores['Algorithm'] = AlgoName
ClusterEvalScores
This unsupervised learning approach performs best in terms of finding true positives.
## Generate predictions for the unlabeled submission data by re-running the
## same PCA + KMeans pipeline that was used on the training data above.
xSubDF.drop(columns='Class', inplace=True)
## Generating Result for other data
## Keep every feature column except the 'Index' identifier.
xSubDF_Fil = xSubDF.loc[:,[col for col in xSubDF.columns.values if col != 'Index']]
AlgoToUse = 'PCA' ##
AlgoConfig = '''{'n_components': None, 'copy': True, 'whiten': False, 'svd_solver': 'auto', 'tol':0.0, 'iterated_power':'auto', 'random_state':None }'''
xSubDF_DimTransf, Model = DimensionTransf(AlgoToUse, AlgoConfig, xSubDF_Fil)
ClustAlgo = 'KMeans'
ClustAlgo_ParamConfig = '''{'n_clusters':2, 'init':'k-means++', 'n_init': 10, 'max_iter': 300, 'tol': 0.0001}'''
xSubDF_ClustPred = ClusterDevelopment(ClustAlgo, ClustAlgo_ParamConfig, xSubDF_DimTransf)
## BUG FIX: read the predicted-cluster column from xSubDF_ClustPred (this
## dataset's result), not xtrain_ClustPred — the original line pulled the
## TRAINING-set predictions, producing labels for the wrong rows.
ypred = xSubDF_ClustPred[xSubDF_ClustPred.filter(like='_Predict').columns.values[0]]
level_map = {1:0, 0:1} ## Cluster name can get reversed
xSubDF.loc[:,'Class'] = ypred.map(level_map)
xSubDF.loc[:,['Index', 'Class']].to_csv('result_KMeans_deterministic_pred_Class(label_maybe_inverse).csv', sep='\t', index=False)
''' Not Required -- will do later '''